package com.shz.appletsapi.service.webmagic.mine;

import java.util.ArrayList;
import java.util.List;

import com.shz.appletsapi.model.po.News;

import us.codecraft.webmagic.Page;
import us.codecraft.webmagic.Site;
import us.codecraft.webmagic.Spider;
import us.codecraft.webmagic.processor.PageProcessor;


/**
 * WebMagic {@link PageProcessor} that queues every link found in the
 * {@code new-side-box} sidebar and, for pages whose URL contains
 * {@code "notice"}, extracts a {@link News} item and stores it in the page
 * result under the key {@code "news"}.
 *
 * <p>Example of running it standalone with a single crawler thread:
 * <pre>{@code
 * Spider.create(new OnFxhPNPageProcessor())
 *         .addUrl("https://www.feixiaohao.com/")
 *         .thread(1)
 *         .run();
 * }</pre>
 */
public class OnFxhPNPageProcessor implements PageProcessor {

    /** Regex that matches only URLs which do NOT contain the substring "notice". */
    private static final String NON_NOTICE_URL_REGEX = "^((?!notice).)*$";

    /** XPath of the container that wraps a single article. */
    private static final String ARTICLE_XPATH = "//div[@class='artBox article']";

    /** Text that separates the author from the publish time in the article byline. */
    private static final String BYLINE_SEPARATOR = "发布于";

    /** Crawl settings: 3 retries, 1000 ms sleep between requests, 10 s timeout. */
    private final Site site = Site
            .me()
            .setRetryTimes(3)
            .setSleepTime(1000)
            .setTimeOut(10 * 1000);

    /**
     * Queues the sidebar links of {@code page} for crawling and, when the page
     * URL contains "notice", extracts the article into a {@link News} object.
     *
     * <p>If the article title cannot be found the page is marked as skipped
     * instead of failing with a {@link NullPointerException}. If the byline is
     * missing or does not contain the author/publish-time separator, the
     * fields that cannot be determined are simply left unset.
     *
     * @param page the downloaded page handed over by WebMagic
     */
    @Override
    public void process(Page page) {
        List<String> links = page.getHtml().xpath("//div[@class='new-side-box']/ul//a").links().all();
        // Debug output: every discovered link followed by the number of links.
        links.forEach(System.out::println);
        System.out.println(links.size());
        page.addTargetRequests(links);

        if (page.getUrl().regex(NON_NOTICE_URL_REGEX).match()) {
            // Not a notice page: nothing to extract.
            System.out.println("++++");
            return;
        }

        // Title
        String title = extract(page, ARTICLE_XPATH + "/h1/text()");
        if (title == null) {
            // No article title on this page: do not emit a result for it.
            page.setSkip(true);
            return;
        }

        News news = new News();
        news.setTitle(title.trim());
        // Logo
        news.setLogo(extract(page, ARTICLE_XPATH + "/div[1]/img/@src"));
        // Link to the original article
        news.setOriginalLink(extract(page, ARTICLE_XPATH + "/div[1]/span/a/@href"));
        // Byline: "<author> 发布于 <publish time>"
        applyByline(news, extract(page, ARTICLE_XPATH + "/div[1]/text()"));
        // Full article text
        news.setOriginalContent(extract(page, ARTICLE_XPATH + "/allText()"));

        page.putField("news", news);
    }

    /**
     * Returns the crawl configuration used by this processor.
     *
     * @return the {@link Site} settings of this processor
     */
    @Override
    public Site getSite() {
        return site;
    }

    /** Evaluates {@code xpath} against the page HTML; the result may be {@code null} when nothing matches. */
    private static String extract(Page page, String xpath) {
        return page.getHtml().xpath(xpath).toString();
    }

    /** Splits the byline into author and publish time, setting only the parts that are present. */
    private static void applyByline(News news, String byline) {
        if (byline == null) {
            return;
        }
        String[] parts = byline.split(BYLINE_SEPARATOR);
        if (parts.length > 0) {
            news.setAuthor(parts[0].trim());
        }
        if (parts.length > 1) {
            news.setPublishTime(parts[1].trim());
        }
    }
}
